{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/hybridial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of hybridial rows that are long in both the\n",
    "# messages and ctxs columns, then preview the full frame.\n",
    "name = \"hybridial\"\n",
    "df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once instead of re-applying len for every use.\n",
    "n_messages = df.messages.apply(len)\n",
    "n_ctxs = df.ctxs.apply(len)\n",
    "print(n_messages.max())\n",
    "print(n_ctxs.max())\n",
    "\n",
    "# Keep rows with at least half the maximum length in messages AND in ctxs.\n",
    "candidates = df.loc[\n",
    "    (n_messages >= n_messages.max() // 2) & (n_ctxs >= n_ctxs.max() // 2)\n",
    "]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(10, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/sqa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of sqa rows whose messages column is long,\n",
    "# then preview the full frame.\n",
    "name = \"sqa\"\n",
    "df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once and reuse them below.\n",
    "n_messages = df.messages.apply(len)\n",
    "print(n_messages.max())\n",
    "\n",
    "# Keep rows with at least half the maximum messages length.\n",
    "candidates = df.loc[n_messages >= n_messages.max() // 2]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(100, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/doqa_cooking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of doqa_cooking rows whose messages column\n",
    "# is long, then preview the full frame.\n",
    "name = \"doqa_cooking\"\n",
    "df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once and reuse them below.\n",
    "n_messages = df.messages.apply(len)\n",
    "print(n_messages.max())\n",
    "\n",
    "# Keep rows with at least half the maximum messages length.\n",
    "candidates = df.loc[n_messages >= n_messages.max() // 2]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(100, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/doqa_travel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of doqa_travel rows whose messages column\n",
    "# is long, then preview the full frame.\n",
    "name = \"doqa_travel\"\n",
    "df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once and reuse them below.\n",
    "n_messages = df.messages.apply(len)\n",
    "print(n_messages.max())\n",
    "\n",
    "# Keep rows with at least half the maximum messages length.\n",
    "candidates = df.loc[n_messages >= n_messages.max() // 2]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(100, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of doqa_movies rows whose messages column\n",
    "# is long, then preview the full frame.\n",
    "# NOTE(review): unlike the other subsets, this cell has no preceding\n",
    "# markdown link to the dataset viewer - consider adding one.\n",
    "name = \"doqa_movies\"\n",
    "df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once and reuse them below.\n",
    "n_messages = df.messages.apply(len)\n",
    "print(n_messages.max())\n",
    "\n",
    "# Keep rows with at least half the maximum messages length.\n",
    "candidates = df.loc[n_messages >= n_messages.max() // 2]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(100, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://huggingface.co/datasets/nvidia/ChatQA-Training-Data/viewer/synthetic_convqa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export a reproducible sample of synthetic_convqa training rows whose\n",
    "# messages column is long, then preview the full frame.\n",
    "name = \"synthetic_convqa\"\n",
    "df = load_dataset(\"nvidia/ChatQA-Training-Data\", name)[\"train\"].to_pandas()\n",
    "\n",
    "# Compute the per-row lengths once and reuse them below.\n",
    "n_messages = df.messages.apply(len)\n",
    "print(n_messages.max())\n",
    "\n",
    "# Keep rows with at least half the maximum messages length.\n",
    "candidates = df.loc[n_messages >= n_messages.max() // 2]\n",
    "# Cap the sample size: DataFrame.sample raises ValueError when asked for\n",
    "# more rows than the filtered frame contains (replace=False is the default).\n",
    "candidates.sample(min(100, len(candidates)), random_state=42).to_csv(\n",
    "    f\"{name}_samples.csv.gz\", index=False\n",
    ")\n",
    "\n",
    "# Seeded so the preview is identical on every Restart & Run All.\n",
    "df.sample(min(10, len(df)), random_state=42)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
